General

Alignment summary

datadir = "/Volumes/test_data"

sampleList = c("Diagenode_C15410196_1_50", "CST9733_1_100_H3K27me3")

alignResult = c()
for(sample in sampleList){
  alignRes = read.table(paste0(datadir, "/alignment/sam/bowtie2_summary/", sample, "_bowtie2.txt"), header = FALSE, fill = TRUE)
  alignRate = substr(alignRes$V1[6], 1, nchar(as.character(alignRes$V1[6]))-1)
  alignResult = data.frame(Sample = sample, SequencingDepth = alignRes$V1[1] %>% as.character %>% as.numeric, 
                           MappedFragNum_hg19 = alignRes$V1[4] %>% as.character %>% as.numeric + alignRes$V1[5] %>% as.character %>% as.numeric,
                           AlignmentRate_hg19 = alignRate %>% as.numeric) %>% rbind(alignResult, .)
}

alignResult %>% mutate(AlignmentRate_hg19 = paste0(AlignmentRate_hg19, "%"))

Duplicates

dupResult = c()
for(sample in sampleList){
  dupRes = read.table(paste0(datadir, "/removeDuplicate/picard_summary/", sample, "_picard.rmDup.txt"), header = TRUE, fill = TRUE)
  dupResult = data.frame(Sample = sample, MappedFragNum_hg19 = dupRes$READ_PAIRS_EXAMINED[1] %>% as.character %>% as.numeric, DuplicationRate = dupRes$PERCENT_DUPLICATION[1] %>% as.character %>% as.numeric * 100) %>% mutate(UniqueFragNum = (MappedFragNum_hg19 * (1-DuplicationRate/100)) %>% round()) %>% rbind(dupResult, .)
}

alignDupSummary = left_join(alignResult, dupResult, by = c("Sample", "MappedFragNum_hg19")) %>% mutate(AlignmentRate_hg19 = paste0(AlignmentRate_hg19, "%"))
alignDupSummary

Fragment sizes

Without duplicates

fragLen = c()
for(sample in sampleList){
  fragLen = read.table(paste0(datadir, "/alignment/sam/fragmentLen/", sample, "_rmDup_fragmentLen.txt"), header = FALSE) %>% mutate(fragLen = V1 %>% as.numeric, fragCount = V2 %>% as.numeric, Weight = as.numeric(V2)/sum(as.numeric(V2)), Sample = sample) %>% rbind(fragLen, .) 
}

fig1 = fragLen %>% ggplot(aes(x = Sample, y = fragLen, weight = Weight, fill = Sample)) +
  theme_bw(base_size = 15) +
  geom_violin(bw = 5) +
  scale_y_continuous(breaks = seq(0, 800, 50)) +
  ggpubr::rotate_x_text(angle = 20) +
  ylab("Fragment Length") +
  xlab("") +
  theme(legend.position = "bottom")
    
fig2 = fragLen %>% ggplot(aes(x = fragLen, y = fragCount, color = Sample, group = Sample)) +
  theme_bw(base_size = 15) +
  geom_line(size = 1) +
  xlab("Fragment Length") +
  ylab("Count") +
  coord_cartesian(xlim = c(0, 500)) 

ggarrange(fig1, fig2, ncol = 2)

With duplicates

fragLen = c()
for(sample in sampleList){
  fragLen = read.table(paste0(datadir, "/alignment/sam/fragmentLen/", sample, "_withDup_fragmentLen.txt"), header = FALSE) %>% mutate(fragLen = V1 %>% as.numeric, fragCount = V2 %>% as.numeric, Weight = as.numeric(V2)/sum(as.numeric(V2)), Sample = sample) %>% rbind(fragLen, .) 
}

fig3 = fragLen %>% ggplot(aes(x = Sample, y = fragLen, weight = Weight, fill = Sample)) +
    theme_bw(base_size = 15) +
    geom_violin(bw = 5) +
    scale_y_continuous(breaks = seq(0, 800, 50)) +
    ggpubr::rotate_x_text(angle = 20) +
    ylab("Fragment Length") +
    xlab("") +
    theme(legend.position = "bottom")
    
fig4 = fragLen %>% ggplot(aes(x = fragLen, y = fragCount, color = Sample, group = Sample)) +
    theme_bw(base_size = 15) +
    geom_line(size = 1) +
    xlab("Fragment Length") +
    ylab("Count") +
    coord_cartesian(xlim = c(0, 500))

ggarrange(fig3, fig4, ncol = 2)

Peak information

# peak names simplified and all files moved to one directory

sampleList_all = c()
peak_caller = c("SEACR", "MACS2")

for(sample in sampleList){
  for(caller in peak_caller){
    name = paste0(sample, "_", caller)
    sampleList_all = c(sampleList_all, name)
  }
}


hg19_blacklist = ChIPseeker::readPeakFile(paste0(datadir, "/resources/hg19/hg19-blacklist.v2.bed.gz"), as = "GRanges")

blacklist_counts = c()
peakList = list()
peakList_unfilt = list()
for(sample in sampleList_all){
  peak_unfilt = ChIPseeker::readPeakFile(paste0(datadir, "/peakCalling/all_peaks/", sample, ".bed"), as = "GRanges")
  peakList_unfilt = c(peakList_unfilt, peak_unfilt)
  blacklist_count = length(IRanges::subsetByOverlaps(peak_unfilt, hg19_blacklist))
  peak_filt = IRanges::subsetByOverlaps(peak_unfilt, hg19_blacklist, invert = TRUE) %>% BRGenomics::tidyChromosomes(keep.X = TRUE, keep.Y = TRUE)
  peakList = c(peakList, peak_filt)
  blacklist_counts = c(blacklist_counts, blacklist_count)
}

names(peakList) = sampleList_all
names(peakList_unfilt) = sampleList_all
sampleList_all_withDup = c()
for(sample in sampleList_all){
  name = paste0(sample, "_", "withDup")
  sampleList_all_withDup = c(sampleList_all_withDup, name)
}

peakList_withDup = list()
peakList_withDup_unfilt = list()
for(sample in sampleList_all_withDup){
  peak_unfilt = ChIPseeker::readPeakFile(paste0(datadir, "/peakCalling/all_peaks/", sample, ".bed"), as = "GRanges")
  peakList_withDup_unfilt = c(peakList_withDup_unfilt, peak_unfilt)
  peak_filt = IRanges::subsetByOverlaps(peak_unfilt, hg19_blacklist, invert = TRUE) %>% BRGenomics::tidyChromosomes(keep.X = TRUE, keep.Y = TRUE)
  peakList_withDup = c(peakList_withDup, peak_filt)
}

names(peakList_withDup) = sampleList_all_withDup
names(peakList_withDup_unfilt) = sampleList_all_withDup

Peak numbers

ENCODE_H3K27ac = ChIPseeker::readPeakFile(paste0(datadir, "/resources/ENCODE/peaks/H3K27ac_ENCFF044JNJ.bed.narrowPeak"), as = "GRanges")
ENCODE_H3K27me3 = ChIPseeker::readPeakFile(paste0(datadir, "/resources/ENCODE/peaks/H3K27me3_ENCFF000BXB.bed.broadPeak"), as = "GRanges")

peakList_with_ENCODE = c(peakList, ENCODE_H3K27ac, ENCODE_H3K27me3)
sampleList_with_ENCODE = c(sampleList, "ENCODE_H3K27ac", "ENCODE_H3K27me3")
sampleList_all_with_ENCODE = c(sampleList_all, "ENCODE_H3K27ac", "ENCODE_H3K27me3")
peakList_with_ENCODE = setNames(peakList_with_ENCODE, sampleList_all_with_ENCODE)
Total_CT_peaks = c()
Total_CT_peaks_withDup = c()
for(i in 1:length(peakList)){
  Total_CT_peaks = c(Total_CT_peaks, length(peakList[[i]]))
  Total_CT_peaks_withDup = c(Total_CT_peaks_withDup, length(peakList_withDup[[i]]))
}

Total_CT_peaks = data.frame(Sample = sampleList_all, Total_PeakN = Total_CT_peaks, Total_PeakN_withDup = Total_CT_peaks_withDup)
Total_CT_peaks

Blacklisted peaks

blacklisted = data.frame(Sample = sampleList_all, Total_peaks = Total_CT_peaks$Total_PeakN, Blacklisted_peaks = blacklist_counts)
blacklisted$Proportion = blacklisted$Blacklisted_peaks / blacklisted$Total_peaks * 100
blacklisted

Peak widths

peakWidth = c()
for(peak in peakList){
  peakW = GenomicRanges::width(peak) %>% summary() %>% round() %>% list()
  peakWidth = c(peakWidth, peakW)
}

WSummary = c()
for(i in 1:length(peakWidth)){
  WSummary = data.frame(Sample = sampleList_all[i], MinW = array(peakWidth[[i]][1]), MedianW = array(peakWidth[[i]][3]), MeanW = array(peakWidth[[i]][4]), MaxW = array(peakWidth[[i]][6])) %>% rbind(., WSummary)
}

WSummary

Fractions of reads in peaks

calculate_frips = function(sample_list, peak_list, dup){
  inPeakData_df = c()
  
  for(i in 1:length(sample_list)){
    
    # to use single peak files
    if(length(peak_list) > 1){
      peakRes = data.frame(peak_list[[i]])
    } else {
      peakRes = data.frame(peak_list[[1]])
    }

    sample = sample_list[i]
    peak.gr = GenomicRanges::GRanges(seqnames = peakRes$seqnames, IRanges(start = peakRes$start, end = peakRes$end), strand = "*")
    bamFile = paste0(datadir, "/alignment/bam/", sample, "_", dup, "_bowtie2.mapped.bam")
    fragment_counts = chromVAR::getCounts(bamFile, peak.gr, paired = TRUE, by_rg = FALSE, format = "bam")
    inPeakN = counts(fragment_counts)[,1] %>% sum
    inPeakData_df = rbind(inPeakData_df, data.frame(Sample = sample, inPeakN = inPeakN))
  }
  
  return(inPeakData_df)
}
# use original peaks (unfiltered)
SEACR_peakList_unfilt = peakList_unfilt[seq(1, length(peakList_unfilt), by = 2)]
MACS2_peakList_unfilt = peakList_unfilt[seq(2, length(peakList_unfilt), by = 2)]
SEACR_peakList_unfilt_withDup = peakList_withDup_unfilt[seq(1, length(peakList_withDup_unfilt), by = 2)]
MACS2_peakList_unfilt_withDup = peakList_withDup_unfilt[seq(1, length(peakList_withDup_unfilt), by = 2)]

SEACR peaks

SEACR_noDup_inPeakData = calculate_frips(sampleList, SEACR_peakList_unfilt, "rmDup")
write.table(SEACR_noDup_inPeakData, file = paste0(datadir, "/analysis/results/FRiPs/SEACR_noDup_inPeakData_2.csv"), sep = ",", col.names = TRUE, row.names = FALSE)
SEACR_withDup_inPeakData = calculate_frips(sampleList, SEACR_peakList_unfilt_withDup, "withDup")
#write.table(SEACR_withDup_inPeakData, file = paste0(datadir, "/analysis/results/FRiPs/SEACR_withDup_inPeakData.csv"), sep = ",", col.names = TRUE, row.names = FALSE)

MACS2 peaks

MACS2_noDup_inPeakData = calculate_frips(sampleList, MACS2_peakList_unfilt, "rmDup")
#write.table(MACS2_noDup_inPeakData, file = paste0(datadir, "/analysis/results/FRiPs/MACS2_noDup_inPeakData.csv"), sep = ",", col.names = TRUE, row.names = FALSE)
MACS2_withDup_inPeakData = calculate_frips(sampleList, MACS2_peakList_unfilt_withDup, "withDup")
#write.table(MACS2_withDup_inPeakData, file = paste0(datadir, "/analysis/results/FRiPs/MACS2_withDup_inPeakData.csv"), sep = ",", col.names = TRUE, row.names = FALSE)

ENCODE H3K27ac peaks

ENCODE_H3K27ac_noDup_inPeakData = calculate_frips(sampleList, list(ENCODE_H3K27ac), "rmDup")
#write.table(ENCODE_H3K27ac_noDup_inPeakData, file = paste0(datadir, "/analysis/results/FRiPs/ENCODE_H3K27ac_noDup_inPeakData.csv"), sep = ",", col.names = TRUE, row.names = FALSE)
ENCODE_H3K27ac_withDup_inPeakData = calculate_frips(sampleList, list(ENCODE_H3K27ac), "withDup")
#write.table(ENCODE_H3K27ac_withDup_inPeakData, file = paste0(datadir, "/analysis/results/FRiPs/ENCODE_H3K27ac_withDup_inPeakData.csv"), sep = ",", col.names = TRUE, row.names = FALSE)

ENCODE H3K27me3 peaks

ENCODE_H3K27me3_noDup_inPeakData = calculate_frips(sampleList, list(ENCODE_H3K27me3), "rmDup")
#write.table(ENCODE_H3K27me3_noDup_inPeakData, file = paste0(datadir, "/analysis/results/FRiPs/ENCODE_H3K27me3_noDup_inPeakData.csv"), sep = ",", col.names = TRUE, row.names = FALSE)
ENCODE_H3K27me3_withDup_inPeakData = calculate_frips(sampleList, list(ENCODE_H3K27me3), "withDup")
#write.table(ENCODE_H3K27me3_withDup_inPeakData, file = paste0(datadir, "/analysis/results/FRiPs/ENCODE_H3K27me3_withDup_inPeakData.csv"), sep = ",", col.names = TRUE, row.names = FALSE)

Summary

# save and load counts to speed up processing
SEACR_noDup_inPeakData = read.csv(paste0(datadir, "/analysis/results/FRiPs/SEACR_noDup_inPeakData.csv"))
SEACR_withDup_inPeakData = read.csv(paste0(datadir, "/analysis/results/FRiPs/SEACR_withDup_inPeakData.csv"))
MACS2_noDup_inPeakData = read.csv(paste0(datadir, "/analysis/results/FRiPs/MACS2_noDup_inPeakData.csv"))
MACS2_withDup_inPeakData = read.csv(paste0(datadir, "/analysis/results/FRiPs/MACS2_withDup_inPeakData.csv"))
ENCODE_H3K27ac_noDup_inPeakData = read.csv(paste0(datadir, "/analysis/results/FRiPs/ENCODE_H3K27ac_noDup_inPeakData.csv"))
ENCODE_H3K27ac_withDup_inPeakData = read.csv(paste0(datadir, "/analysis/results/FRiPs/ENCODE_H3K27ac_withDup_inPeakData.csv"))
ENCODE_H3K27me3_noDup_inPeakData = read.csv(paste0(datadir, "/analysis/results/FRiPs/ENCODE_H3K27me3_noDup_inPeakData.csv"))
ENCODE_H3K27me3_withDup_inPeakData = read.csv(paste0(datadir, "/analysis/results/FRiPs/ENCODE_H3K27me3_withDup_inPeakData.csv"))
FRiP_data = alignDupSummary[c("Sample", "MappedFragNum_hg19", "UniqueFragNum")] %>%
  left_join(SEACR_noDup_inPeakData, by = "Sample") %>% 
  left_join(SEACR_withDup_inPeakData, by = "Sample") %>% 
  left_join(MACS2_noDup_inPeakData, by = "Sample") %>% 
  left_join(MACS2_withDup_inPeakData, by = "Sample") %>% 
  left_join(ENCODE_H3K27ac_noDup_inPeakData, by = "Sample") %>% 
  left_join(ENCODE_H3K27ac_withDup_inPeakData, by = "Sample") %>% 
  left_join(ENCODE_H3K27me3_noDup_inPeakData, by = "Sample") %>% 
  left_join(ENCODE_H3K27me3_withDup_inPeakData, by = "Sample")

colnames(FRiP_data) = c("Sample", "MappedFragNum_hg19", "UniqueFragNum", "SEACR_noDup_inPeakN", "SEACR_withDup_inPeakN", "MACS2_noDup_inPeakN", "MACS2_withDup_inPeakN", "ENCODE_H3K27ac_noDup_inPeakN", "ENCODE_H3K27ac_withDup_inPeakN", "ENCODE_H3K27me3_noDup_inPeakN", "ENCODE_H3K27me3_withDup_inPeakN")

FRiP_data$SEACR_noDup_FRiP = FRiP_data$SEACR_noDup_inPeakN/FRiP_data$UniqueFragNum * 100
FRiP_data$SEACR_withDup_FRiP = FRiP_data$SEACR_withDup_inPeakN/FRiP_data$MappedFragNum_hg19 * 100

FRiP_data$MACS2_noDup_FRiP = FRiP_data$MACS2_noDup_inPeakN/FRiP_data$UniqueFragNum * 100
FRiP_data$MACS2_withDup_FRiP = FRiP_data$MACS2_withDup_inPeakN/FRiP_data$MappedFragNum_hg19 * 100

FRiP_data$ENCODE_H3K27ac_noDup_FRiP = FRiP_data$ENCODE_H3K27ac_noDup_inPeakN/FRiP_data$UniqueFragNum * 100
FRiP_data$ENCODE_H3K27ac_withDup_FRiP = FRiP_data$ENCODE_H3K27ac_withDup_inPeakN/FRiP_data$MappedFragNum_hg19 * 100

FRiP_data$ENCODE_H3K27me3_noDup_FRiP = FRiP_data$ENCODE_H3K27me3_noDup_inPeakN/FRiP_data$UniqueFragNum * 100
FRiP_data$ENCODE_H3K27me3_withDup_FRiP = FRiP_data$ENCODE_H3K27me3_withDup_inPeakN/FRiP_data$MappedFragNum_hg19 * 100

FRiP_data
FRiP_summary = FRiP_data[c("Sample", "SEACR_noDup_FRiP", "SEACR_withDup_FRiP", "MACS2_noDup_FRiP", "MACS2_withDup_FRiP", "ENCODE_H3K27ac_noDup_FRiP", "ENCODE_H3K27ac_withDup_FRiP", "ENCODE_H3K27me3_noDup_FRiP", "ENCODE_H3K27me3_withDup_FRiP")]
FRiP_summary_melt = melt(FRiP_summary, id.vars = c("Sample"))

FRiP_plot = ggplot(FRiP_summary_melt, aes(Sample, value)) +
  theme_bw(base_size = 20) +
  geom_bar(aes(fill = variable), width = 0.6, position = position_dodge(width = 0.8), stat = "identity", colour = "black") +  
  ylim(0, 100) +
  ylab("% of fragments in peaks") +
  xlab("Sample") +
  ggtitle("Fragments in sample and ENCODE peaks") +
  theme(legend.title = element_blank()) +
  ggpubr::rotate_x_text(angle = 45) +
  theme(legend.position = "bottom") +
  scale_fill_manual(labels = c("SEACR noDup", "SEACR withDup", "MACS2 noDup", "MACS2 withDup", "ENCODE H3K27ac noDup", "ENCODE H3K27ac withDup", "ENCODE H3K27me3 noDup", "ENCODE H3K27me3 withDup"), values = c("violetred", "violet", "firebrick2", "orange1", "springgreen4", "chartreuse2", "royalblue2", "cyan3"))

FRiP_plot

Fingerprint plots

Cumulative read enrichment across 1000bp genomic bins, scaled to 2M reads.

Fingerprint

Sample correlation

Read-level

Across 500bp genome-wide bins

counts = read.csv(paste0(datadir, "/correlation/hg19_500bp_overlaps_counts.txt"), sep = "\t", header = FALSE)
colnames(counts) = c("chr", "start", "end", sampleList_with_ENCODE)
# same order as bam list
counts_corr = counts[4:ncol(counts)] %>% as.matrix() %>% preprocessCore::normalize.quantiles() %>% round() %>% cor(method = "pearson")
colnames(counts_corr) = sampleList_with_ENCODE
rownames(counts_corr) = sampleList_with_ENCODE

counts_corr[counts_corr <= 0] = 0
counts_rounded = counts_corr %>% round(2)
heatmap.2(x = counts_rounded, col = "viridis", trace = "none", density.info = "none", cexRow = 1.5, cexCol = 1.5, notecol = "black", lhei = c(2, 11), lwid = c(2, 8), margins = c(20, 20))

Across ENCODE H3K27ac peak ranges

counts = read.csv(paste0(datadir, "/correlation/H3K27ac_ENCFF044JNJ_overlaps_counts.txt"), sep = "\t", header = FALSE)
colnames(counts) = c("chr", "start", "end", "name", "score", "strand", "signalValue", "pValue", "qValue", "peak", sampleList_with_ENCODE)
# same order as bam list
counts_corr = counts[11:ncol(counts)] %>% as.matrix() %>% preprocessCore::normalize.quantiles() %>% round() %>% cor(method = "pearson")
colnames(counts_corr) = sampleList_with_ENCODE
rownames(counts_corr) = sampleList_with_ENCODE

counts_corr[counts_corr <= 0] = 0
counts_rounded = counts_corr %>% round(2)
heatmap.2(x = counts_rounded, col = "viridis", trace = "none", density.info = "none", cexRow = 1.5, cexCol = 1.5, notecol = "black", lhei = c(2, 11), lwid = c(2, 8), margins = c(20, 20))

Peak-level

DBA_mixed = NULL

for(i in 1:length(sampleList_all_with_ENCODE)){
   DBA_mixed = DiffBind::dba.peakset(DBA = DBA_mixed, peaks = peakList_with_ENCODE[[i]], sampID = sampleList_all_with_ENCODE[i], peak.format = "bed")
}

All peaks

DiffBind::dba.plotHeatmap(DBA_mixed, lhei = c(1,5), lwid = c(1,5), keysize = 0.75, cexRow = 1.25, cexCol = 1.25, margin = 20)

CUT&Tag vs ENCODE H3K27ac / H3K27me3 peak overlap

C&T peaks in ENCODE

count_overlaps = function(peak_list, reference){
  
  sample_peaks_in_ref = c()
  
  for(i in 1:length(peak_list)){
    overlaps_count = GenomicRanges::countOverlaps(query = peak_list[[i]], subject = reference)
    sample_in_reference = overlaps_count[overlaps_count != 0] %>% length()
    sample_peaks_in_ref = c(sample_peaks_in_ref, sample_in_reference)
  }
  
  return(sample_peaks_in_ref)
}


ENCODE_H3K27ac_overlapping_CT_peaks = count_overlaps(peakList, ENCODE_H3K27ac)
ENCODE_H3K27me3_overlapping_CT_peaks = count_overlaps(peakList, ENCODE_H3K27ac)
ENCODE_H3K27ac_overlapping_CT_peaks_withDup = count_overlaps(peakList_withDup, ENCODE_H3K27me3)
ENCODE_H3K27me3_overlapping_CT_peaks_withDup = count_overlaps(peakList_withDup, ENCODE_H3K27me3)


CT_in_ENCODE = data.frame(Sample = sampleList_all, CT_peaks_in_ENCODE_H3K27ac = ENCODE_H3K27ac_overlapping_CT_peaks, CT_peaks_in_ENCODE_H3K27me3 = ENCODE_H3K27me3_overlapping_CT_peaks,  CT_peaks_in_ENCODE_H3K27ac_withDup = ENCODE_H3K27ac_overlapping_CT_peaks_withDup, CT_peaks_in_ENCODE_H3K27me3_withDup = ENCODE_H3K27me3_overlapping_CT_peaks_withDup)
CT_in_ENCODE

Collect C&T peaks in ENCODE:

CT_peaks_in_ENCODE_H3K27ac_set = c()
CT_peaks_in_ENCODE_H3K27me3_set = c()

for(i in 1:length(peakList)){
  in_ENCODE_H3K27ac = IRanges::subsetByOverlaps(peakList[[i]], ENCODE_H3K27ac)
  in_ENCODE_H3K27me3 = IRanges::subsetByOverlaps(peakList[[i]], ENCODE_H3K27me3)
  CT_peaks_in_ENCODE_H3K27ac_set = c(CT_peaks_in_ENCODE_H3K27ac_set, in_ENCODE_H3K27ac)
  CT_peaks_in_ENCODE_H3K27me3_set = c(CT_peaks_in_ENCODE_H3K27me3_set, in_ENCODE_H3K27me3)
}

CT_peaks_in_ENCODE_H3K27ac_set = setNames(CT_peaks_in_ENCODE_H3K27ac_set, sampleList_all)
CT_peaks_in_ENCODE_H3K27me3_set = setNames(CT_peaks_in_ENCODE_H3K27me3_set, sampleList_all)

Collect C&T peaks not in ENCODE:

CT_peaks_not_in_ENCODE_H3K27ac_set = c()
CT_peaks_not_in_ENCODE_H3K27me3_set = c()

for(i in 1:length(peakList)){
  not_in_ENCODE_H3K27ac = IRanges::subsetByOverlaps(peakList[[i]], ENCODE_H3K27ac, invert = TRUE)
  not_in_ENCODE_H3K27me3 = IRanges::subsetByOverlaps(peakList[[i]], ENCODE_H3K27me3, invert = TRUE)
  CT_peaks_not_in_ENCODE_H3K27ac_set = c(CT_peaks_not_in_ENCODE_H3K27ac_set, not_in_ENCODE_H3K27ac)
  CT_peaks_not_in_ENCODE_H3K27me3_set = c(CT_peaks_not_in_ENCODE_H3K27me3_set, not_in_ENCODE_H3K27me3)
}

CT_peaks_not_in_ENCODE_H3K27ac_set = setNames(CT_peaks_not_in_ENCODE_H3K27ac_set, sampleList_all)
CT_peaks_not_in_ENCODE_H3K27me3_set = setNames(CT_peaks_not_in_ENCODE_H3K27me3_set, sampleList_all)

ENCODE peaks in C&T

count_overlaps_rev = function(peak_list, reference){
  
  ref_peaks_in_sample = c()
  
  for(i in 1:length(peak_list)){
    overlaps_count = GenomicRanges::countOverlaps(query = reference, subject = peak_list[[i]])
    reference_in_sample = overlaps_count[overlaps_count != 0] %>% length()
    ref_peaks_in_sample = c(ref_peaks_in_sample, reference_in_sample)
  }
  
  return(ref_peaks_in_sample)
}


captured_H3K27ac_ENCODE_list = count_overlaps_rev(reference = ENCODE_H3K27ac, peak_list = peakList)
captured_H3K27me3_ENCODE_list = count_overlaps_rev(reference = ENCODE_H3K27me3, peak_list = peakList)
captured_H3K27ac_ENCODE_list_withDup = count_overlaps_rev(reference = ENCODE_H3K27ac, peak_list = peakList_withDup)
captured_H3K27me3_ENCODE_list_withDup = count_overlaps_rev(reference = ENCODE_H3K27me3, peakList_withDup)

Total_captured_ENCODE_peaks = data.frame(Sample = sampleList_all, ENCODE_H3K27ac_captured_peaks = captured_H3K27ac_ENCODE_list, ENCODE_H3K27me3_captured_peaks = captured_H3K27me3_ENCODE_list, ENCODE_H3K27ac_captured_peaks_withDup = captured_H3K27ac_ENCODE_list_withDup, ENCODE_H3K27me3_captured_peaks_withDup = captured_H3K27me3_ENCODE_list_withDup)
Total_captured_ENCODE_peaks

Collect ENCODE peaks in C&T:

ENCODE_H3K27ac_peaks_in_CT_set = c()
ENCODE_H3K27me3_peaks_in_CT_set = c()

for(i in 1:length(peakList)){
  in_CT_H3K27ac = IRanges::subsetByOverlaps(ENCODE_H3K27ac, peakList[[i]])
  in_CT_H3K27me3 = IRanges::subsetByOverlaps(ENCODE_H3K27me3, peakList[[i]])
  ENCODE_H3K27ac_peaks_in_CT_set = c(ENCODE_H3K27ac_peaks_in_CT_set, in_CT_H3K27ac)
  ENCODE_H3K27me3_peaks_in_CT_set = c(ENCODE_H3K27me3_peaks_in_CT_set, in_CT_H3K27me3)
}

ENCODE_H3K27ac_peaks_in_CT_set = setNames(ENCODE_H3K27ac_peaks_in_CT_set, sampleList_all)
ENCODE_H3K27me3_peaks_in_CT_set = setNames(ENCODE_H3K27me3_peaks_in_CT_set, sampleList_all)

Collect ENCODE peaks not in C&T:

ENCODE_H3K27ac_peaks_not_in_CT_set = c()
ENCODE_H3K27me3_peaks_not_in_CT_set = c()

for(i in 1:length(peakList)){
  not_in_CT_H3K27ac = IRanges::subsetByOverlaps(ENCODE_H3K27ac, peakList[[i]], invert = TRUE)
  not_in_CT_H3K27me3 = IRanges::subsetByOverlaps(ENCODE_H3K27me3, peakList[[i]], invert = TRUE)
  ENCODE_H3K27ac_peaks_not_in_CT_set = c(ENCODE_H3K27ac_peaks_not_in_CT_set, not_in_CT_H3K27ac)
  ENCODE_H3K27me3_peaks_not_in_CT_set = c(ENCODE_H3K27me3_peaks_not_in_CT_set, not_in_CT_H3K27me3)
}

ENCODE_H3K27ac_peaks_not_in_CT_set = setNames(ENCODE_H3K27ac_peaks_not_in_CT_set, sampleList_all)
ENCODE_H3K27me3_peaks_not_in_CT_set = setNames(ENCODE_H3K27me3_peaks_not_in_CT_set, sampleList_all)

Summarize

ENCODE_overlaps = Total_captured_ENCODE_peaks %>% left_join(Total_CT_peaks, by = "Sample") %>% left_join(CT_in_ENCODE, by = "Sample")


ENCODE_overlaps$Percentage_of_CT_in_ENCODE_H3K27ac = ENCODE_overlaps$CT_peaks_in_ENCODE_H3K27ac / ENCODE_overlaps$Total_PeakN * 100
ENCODE_overlaps$Percentage_of_total_ENCODE_H3K27ac = ENCODE_overlaps$ENCODE_H3K27ac_captured_peaks / length(ENCODE_H3K27ac) * 100

ENCODE_overlaps$Percentage_of_CT_in_ENCODE_H3K27me3 = ENCODE_overlaps$CT_peaks_in_ENCODE_H3K27me3 / ENCODE_overlaps$Total_PeakN * 100
ENCODE_overlaps$Percentage_of_total_ENCODE_H3K27me3 = ENCODE_overlaps$ENCODE_H3K27me3_captured_peaks / length(ENCODE_H3K27me3) * 100


ENCODE_overlaps$Percentage_of_CT_in_ENCODE_H3K27ac_withDup = ENCODE_overlaps$CT_peaks_in_ENCODE_H3K27ac_withDup / ENCODE_overlaps$Total_PeakN_withDup * 100
ENCODE_overlaps$Percentage_of_total_ENCODE_H3K27ac_withDup = ENCODE_overlaps$ENCODE_H3K27ac_captured_peaks_withDup / length(ENCODE_H3K27ac) * 100

ENCODE_overlaps$Percentage_of_CT_in_ENCODE_H3K27me3_withDup = ENCODE_overlaps$CT_peaks_in_ENCODE_H3K27me3_withDup / ENCODE_overlaps$Total_PeakN_withDup * 100
ENCODE_overlaps$Percentage_of_total_ENCODE_H3K27me3_withDup = ENCODE_overlaps$ENCODE_H3K27me3_captured_peaks_withDup / length(ENCODE_H3K27me3) * 100


ENCODE_overlaps
ENCODE_overlaps_H3K27ac = ENCODE_overlaps[c("Sample", "Percentage_of_CT_in_ENCODE_H3K27ac", "Percentage_of_total_ENCODE_H3K27ac")]
Percentage_of_CT_in_ENCODE_H3K27ac_SEACR = ENCODE_overlaps_H3K27ac[c(seq(1, nrow(ENCODE_overlaps_H3K27ac), by = 2)),2] 
Percentage_of_CT_in_ENCODE_H3K27ac_MACS2 = ENCODE_overlaps_H3K27ac[c(seq(2, nrow(ENCODE_overlaps_H3K27ac), by = 2)),2]
Percentage_of_CT_in_ENCODE_H3K27ac_summary = data.frame(Sample = sampleList, Percentage_of_CT_in_ENCODE_H3K27ac_SEACR = Percentage_of_CT_in_ENCODE_H3K27ac_SEACR, Percentage_of_CT_in_ENCODE_H3K27ac_MACS2 = Percentage_of_CT_in_ENCODE_H3K27ac_MACS2)

Percentage_of_total_ENCODE_H3K27ac_SEACR = ENCODE_overlaps_H3K27ac[c(seq(1, nrow(ENCODE_overlaps_H3K27ac), by = 2)),3] 
Percentage_of_total_ENCODE_H3K27ac_MACS2 = ENCODE_overlaps_H3K27ac[c(seq(2, nrow(ENCODE_overlaps_H3K27ac), by = 2)),3]
Percentage_of_total_ENCODE_H3K27ac_summary = data.frame(Sample = sampleList, Percentage_of_total_ENCODE_H3K27ac_SEACR = Percentage_of_total_ENCODE_H3K27ac_SEACR, Percentage_of_total_ENCODE_H3K27ac_MACS2 = Percentage_of_total_ENCODE_H3K27ac_MACS2)


ENCODE_overlaps_H3K27me3 = ENCODE_overlaps[c("Sample", "Percentage_of_CT_in_ENCODE_H3K27me3", "Percentage_of_total_ENCODE_H3K27me3")]
Percentage_of_CT_in_ENCODE_H3K27me3_SEACR = ENCODE_overlaps_H3K27me3[c(seq(1, nrow(ENCODE_overlaps_H3K27me3), by = 2)),2] 
Percentage_of_CT_in_ENCODE_H3K27me3_MACS2 = ENCODE_overlaps_H3K27me3[c(seq(2, nrow(ENCODE_overlaps_H3K27me3), by = 2)),2]
Percentage_of_CT_in_ENCODE_H3K27me3_summary = data.frame(Sample = sampleList, Percentage_of_CT_in_ENCODE_H3K27me3_SEACR = Percentage_of_CT_in_ENCODE_H3K27me3_SEACR, Percentage_of_CT_in_ENCODE_H3K27me3_MACS2 = Percentage_of_CT_in_ENCODE_H3K27me3_MACS2)

Percentage_of_total_ENCODE_H3K27me3_SEACR = ENCODE_overlaps_H3K27me3[c(seq(1, nrow(ENCODE_overlaps_H3K27me3), by = 2)),3] 
Percentage_of_total_ENCODE_H3K27me3_MACS2 = ENCODE_overlaps_H3K27me3[c(seq(2, nrow(ENCODE_overlaps_H3K27me3), by = 2)),3]
Percentage_of_total_ENCODE_H3K27me3_summary = data.frame(Sample = sampleList, Percentage_of_total_ENCODE_H3K27me3_SEACR = Percentage_of_total_ENCODE_H3K27me3_SEACR, Percentage_of_total_ENCODE_H3K27me3_MACS2 = Percentage_of_total_ENCODE_H3K27me3_MACS2)


ENCODE_overlaps_H3K27ac_withDup = ENCODE_overlaps[c("Sample", "Percentage_of_CT_in_ENCODE_H3K27ac_withDup", "Percentage_of_total_ENCODE_H3K27ac_withDup")]
Percentage_of_CT_in_ENCODE_H3K27ac_withDup_SEACR = ENCODE_overlaps_H3K27ac_withDup[c(seq(1, nrow(ENCODE_overlaps_H3K27ac_withDup), by = 2)),2] 
Percentage_of_CT_in_ENCODE_H3K27ac_withDup_MACS2 = ENCODE_overlaps_H3K27ac_withDup[c(seq(2, nrow(ENCODE_overlaps_H3K27ac_withDup), by = 2)),2]
Percentage_of_CT_in_ENCODE_H3K27ac_withDup_summary = data.frame(Sample = sampleList, Percentage_of_CT_in_ENCODE_H3K27ac_withDup_SEACR = Percentage_of_CT_in_ENCODE_H3K27ac_withDup_SEACR, Percentage_of_CT_in_ENCODE_H3K27ac_withDup_MACS2 = Percentage_of_CT_in_ENCODE_H3K27ac_withDup_MACS2)

Percentage_of_total_ENCODE_H3K27ac_withDup_SEACR = ENCODE_overlaps_H3K27ac_withDup[c(seq(1, nrow(ENCODE_overlaps_H3K27ac_withDup), by = 2)),3] 
Percentage_of_total_ENCODE_H3K27ac_withDup_MACS2 = ENCODE_overlaps_H3K27ac_withDup[c(seq(2, nrow(ENCODE_overlaps_H3K27ac_withDup), by = 2)),3]
Percentage_of_total_ENCODE_H3K27ac_withDup_summary = data.frame(Sample = sampleList, Percentage_of_total_ENCODE_H3K27ac_withDup_SEACR = Percentage_of_total_ENCODE_H3K27ac_withDup_SEACR, Percentage_of_total_ENCODE_H3K27ac_withDup_MACS2 = Percentage_of_total_ENCODE_H3K27ac_withDup_MACS2)


ENCODE_overlaps_H3K27me3_withDup = ENCODE_overlaps[c("Sample", "Percentage_of_CT_in_ENCODE_H3K27me3_withDup", "Percentage_of_total_ENCODE_H3K27me3_withDup")]
Percentage_of_CT_in_ENCODE_H3K27me3_withDup_SEACR = ENCODE_overlaps_H3K27me3_withDup[c(seq(1, nrow(ENCODE_overlaps_H3K27me3_withDup), by = 2)),2] 
Percentage_of_CT_in_ENCODE_H3K27me3_withDup_MACS2 = ENCODE_overlaps_H3K27me3_withDup[c(seq(2, nrow(ENCODE_overlaps_H3K27me3_withDup), by = 2)),2]
Percentage_of_CT_in_ENCODE_H3K27me3_withDup_summary = data.frame(Sample = sampleList, Percentage_of_CT_in_ENCODE_H3K27me3_withDup_SEACR = Percentage_of_CT_in_ENCODE_H3K27me3_withDup_SEACR, Percentage_of_CT_in_ENCODE_H3K27me3_withDup_MACS2 = Percentage_of_CT_in_ENCODE_H3K27me3_withDup_MACS2)

Percentage_of_total_ENCODE_H3K27me3_withDup_SEACR = ENCODE_overlaps_H3K27me3_withDup[c(seq(1, nrow(ENCODE_overlaps_H3K27me3_withDup), by = 2)),3] 
Percentage_of_total_ENCODE_H3K27me3_withDup_MACS2 = ENCODE_overlaps_H3K27me3_withDup[c(seq(2, nrow(ENCODE_overlaps_H3K27me3_withDup), by = 2)),3]
Percentage_of_total_ENCODE_H3K27me3_withDup_summary = data.frame(Sample = sampleList, Percentage_of_total_ENCODE_H3K27me3_withDup_SEACR = Percentage_of_total_ENCODE_H3K27me3_withDup_SEACR, Percentage_of_total_ENCODE_H3K27me3_withDup_MACS2 = Percentage_of_total_ENCODE_H3K27me3_withDup_MACS2)

Precision/recall metrics

Precision: % of CUT&Tag peaks in ENCODE Recall: % of ENCODE peaks captured

f_measure = data.frame(Sample = sampleList_all)

f_measure$true_pos = lapply(ENCODE_H3K27ac_peaks_in_CT_set, length) %>% unname() %>% unlist()
f_measure$false_pos = lapply(CT_peaks_not_in_ENCODE_H3K27ac_set, length) %>% unname() %>% unlist()
f_measure$false_neg = lapply(ENCODE_H3K27ac_peaks_not_in_CT_set, length) %>% unname() %>% unlist()

f_measure$F1 = f_measure$true_pos/(f_measure$true_pos + 0.5*(f_measure$false_pos + f_measure$false_neg))
f_measure_F1_SEACR_list = f_measure[c(seq(1, nrow(f_measure), by = 2)), 5] 
f_measure_F1_MACS2_list = f_measure[c(seq(2, nrow(f_measure), by = 2)), 5]

f_measure_F1 = data.frame(Sample = sampleList, SEACR_F1 = f_measure_F1_SEACR_list, MACS2_F1 = f_measure_F1_MACS2_list)
f_measure_F1_melt = melt(f_measure_F1, id.vars = "Sample")

F1_plot = ggplot(f_measure_F1_melt, aes(Sample, value)) +
  geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
  theme_bw(base_size = 15) +
  ylab("F1 score") +
  xlab("") +
  ggtitle("F-measures of precision and recall (ENCODE H3K27ac)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "bottom") +
  scale_fill_discrete(name = "Peak caller", labels = c("SEACR", "MACS2"))

F1_plot

C&T peaks in ENCODE

H3K27ac

Percentage_of_CT_in_ENCODE_H3K27ac_summary_melt = melt(Percentage_of_CT_in_ENCODE_H3K27ac_summary, id.vars = "Sample")

Peaks_in_ENCODE_plot = ggplot(Percentage_of_CT_in_ENCODE_H3K27ac_summary_melt, aes(Sample, value)) +
  theme_bw() +
  geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
  ylim(0, 100) +
  ylab("% of peaks in ENCODE") +
  xlab("") +
  ggtitle("Proportion of CUT&Tag peaks falling into ENCODE H3K27ac") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(axis.text = element_text(size = 15)) +
  theme(legend.position = "bottom") +
  scale_fill_discrete(name = "Peak caller", labels = c("SEACR", "MACS2")) +
  theme(plot.margin = unit(c(1,1,1,1), "cm"))

Peaks_in_ENCODE_plot

Percentage_of_CT_in_ENCODE_H3K27ac_withDup_summary_melt = melt(Percentage_of_CT_in_ENCODE_H3K27ac_withDup_summary, id.vars = "Sample")

Peaks_in_ENCODE_plot_withDup = ggplot(Percentage_of_CT_in_ENCODE_H3K27ac_withDup_summary_melt, aes(Sample, value)) +
  theme_bw() +
  geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
  ylim(0, 100) +
  ylab("% of peaks in ENCODE") +
  xlab("") +
  ggtitle("Proportion of CUT&Tag peaks falling into ENCODE H3K27ac (with duplicates)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "bottom") +
  scale_fill_discrete(name = "Peak caller", labels = c("SEACR", "MACS2")) +
  theme(plot.margin = unit(c(1,1,1,1), "cm"))

Peaks_in_ENCODE_plot_withDup

q values
master_qvals_df = c()

for(i in 1:length(peakList)){
  df_captured = data.frame("Sample" = sampleList_all[i], "-log10(qval)" = ENCODE_H3K27ac_peaks_in_CT_set[[i]]$V9, "Captured" = "Captured", check.names = FALSE)
  df_missed = data.frame("Sample" = sampleList_all[i], "-log10(qval)" = ENCODE_H3K27ac_peaks_not_in_CT_set[[i]]$V9, "Captured" = "Missed", check.names = FALSE)
  master_qvals_df = rbind(master_qvals_df, rbind(df_captured, df_missed))
}

master_qvals_df_melt = melt(master_qvals_df, id.vars = c("Sample", "Captured"))
ggplot(master_qvals_df_melt, aes(x = value, y = Sample, fill = Captured)) + 
  theme_bw() +
  stat_boxplot(geom = "errorbar") + 
  geom_boxplot(outlier.shape = NA) +
  coord_cartesian(xlim = c(1, 250)) +
  ylab("") +
  xlab("-log10(q-val)") +
  labs(fill = "ENCODE H3K27ac peaks") +
  theme(legend.position = "bottom") +
  theme(text = element_text(size = 25))

tests = list()

for(i in 1:length(sampleList_all)){
  captured = master_qvals_df[(master_qvals_df$Sample == sampleList_all[i] & master_qvals_df$Captured == "Captured"),]
  missed = master_qvals_df[(master_qvals_df$Sample == sampleList_all[i] & master_qvals_df$Captured == "Missed"),]
  var_test = var.test(captured$`-log10(qval)`, missed$`-log10(qval)`)
  var = ifelse(var_test$p.value < 0.05, FALSE, TRUE)
  test = t.test(captured$`-log10(qval)`, missed$`-log10(qval)`, var.equal = var, paired = FALSE, alternative = c("two.sided"))
  #tests[i] = ifelse(test$p.value < 0.05, "significant", "insignificant")
  tests[[i]] = test
  }

names(tests) = sampleList_all
tests
## $Diagenode_C15410196_1_50_SEACR
## 
##  Welch Two Sample t-test
## 
## data:  captured$`-log10(qval)` and missed$`-log10(qval)`
## t = 83.354, df = 21985, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  47.59428 49.88656
## sample estimates:
## mean of x mean of y 
##  65.92364  17.18322 
## 
## 
## $Diagenode_C15410196_1_50_MACS2
## 
##  Welch Two Sample t-test
## 
## data:  captured$`-log10(qval)` and missed$`-log10(qval)`
## t = 93.556, df = 18945, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  56.95664 59.39429
## sample estimates:
## mean of x mean of y 
##  73.89533  15.71987 
## 
## 
## $CST9733_1_100_H3K27me3_SEACR
## 
##  Welch Two Sample t-test
## 
## data:  captured$`-log10(qval)` and missed$`-log10(qval)`
## t = 6.0819, df = 942.89, p-value = 1.724e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  11.36836 22.20002
## sample estimates:
## mean of x mean of y 
##  51.93975  35.15557 
## 
## 
## $CST9733_1_100_H3K27me3_MACS2
## 
##  Welch Two Sample t-test
## 
## data:  captured$`-log10(qval)` and missed$`-log10(qval)`
## t = 10.153, df = 1374.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  18.73792 27.71304
## sample estimates:
## mean of x mean of y 
##  58.07635  34.85088

H3K27me3

Percentage_of_CT_in_ENCODE_H3K27me3_summary_melt = melt(Percentage_of_CT_in_ENCODE_H3K27me3_summary, id.vars = "Sample")

Peaks_in_ENCODE_H3K27me3_plot = ggplot(Percentage_of_CT_in_ENCODE_H3K27me3_summary_melt, aes(Sample, value)) +
  theme_bw() +
  geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
  ylim(0, 100) +
  ylab("% of peaks in ENCODE") +
  xlab("") +
  ggtitle("Proportion of CUT&Tag peaks falling into ENCODE H3K27me3") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "bottom") +
  scale_fill_discrete(name = "Peak caller", labels = c("SEACR", "MACS2")) +
  theme(plot.margin = unit(c(1,1,1,1), "cm"))

Peaks_in_ENCODE_H3K27me3_plot

Percentage_of_CT_in_ENCODE_H3K27me3_withDup_summary_melt = melt(Percentage_of_CT_in_ENCODE_H3K27me3_withDup_summary, id.vars = "Sample")

Peaks_in_ENCODE_H3K27me3_plot_withDup = ggplot(Percentage_of_CT_in_ENCODE_H3K27me3_withDup_summary_melt, aes(Sample, value)) +
  theme_bw() +
  geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
  ylim(0, 100) +
  ylab("% of peaks in ENCODE") +
  xlab("") +
  ggtitle("Proportion of CUT&Tag peaks falling into ENCODE H3K27me3 (with duplicates)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(axis.text = element_text(size = 15)) +
  theme(legend.position = "bottom") +
  scale_fill_discrete(name = "Peak caller", labels = c("SEACR", "MACS2")) +
  theme(plot.margin = unit(c(1,1,1,1), "cm"))

Peaks_in_ENCODE_H3K27me3_plot_withDup

ENCODE peaks in C&T

H3K27ac

Percentage_of_total_ENCODE_H3K27ac_summary_melt = melt(Percentage_of_total_ENCODE_H3K27ac_summary, id.vars = "Sample")

ENCODE_in_CT_plot = ggplot(Percentage_of_total_ENCODE_H3K27ac_summary_melt, aes(Sample, value)) +
  theme_bw(base_size = 15) +
  geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
  ylim(0, 100) +
  ylab("% of ENCODE peaks captured") +
  xlab("Sample") +
  ggtitle("Proportion of ENCODE H3K27ac peaks captured by CUT&Tag") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "bottom") +
  scale_fill_discrete(name = "Peak caller", labels = c("SEACR", "MACS2")) 

ENCODE_in_CT_plot

Percentage_of_total_ENCODE_H3K27ac_withDup_summary_melt = melt(Percentage_of_total_ENCODE_H3K27ac_withDup_summary, id.vars = "Sample")

ENCODE_in_CT_plot_withDup = ggplot(Percentage_of_total_ENCODE_H3K27ac_withDup_summary_melt, aes(Sample, value)) +
    theme_bw(base_size = 15) +
  geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
  ylim(0, 100) +
  ylab("% of ENCODE peaks captured") +
  xlab("Sample") +
  ggtitle("Proportion of ENCODE H3K27ac peaks captured by CUT&Tag (with duplicates)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "bottom") +
  scale_fill_discrete(name = "Peak caller", labels = c("SEACR", "MACS2")) 

ENCODE_in_CT_plot_withDup

H3K27me3

Percentage_of_total_ENCODE_H3K27me3_summary_melt = melt(Percentage_of_total_ENCODE_H3K27me3_summary, id.vars = "Sample")

ENCODE_in_CT_H3K27me3_plot = ggplot(Percentage_of_total_ENCODE_H3K27me3_summary_melt, aes(Sample, value)) +
  theme_bw(base_size = 15) +
  geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
  ylim(0, 100) +
  ylab("% of ENCODE peaks captured") +
  xlab("") +
  ggtitle("Proportion of ENCODE H3K27me3 peaks captured by CUT&Tag") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "bottom") +
  scale_fill_discrete(name = "Peak caller", labels = c("SEACR", "MACS2"))

ENCODE_in_CT_H3K27me3_plot

Percentage_of_total_ENCODE_H3K27me3_withDup_summary_melt = melt(Percentage_of_total_ENCODE_H3K27me3_withDup_summary, id.vars = "Sample")

ENCODE_in_CT_H3K27me3_plot_withDup = ggplot(Percentage_of_total_ENCODE_H3K27me3_withDup_summary_melt, aes(Sample, value)) +
  theme_bw(base_size = 15) +
  geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
  ylim(0, 100) +
  ylab("% of ENCODE peaks captured") +
  xlab("") +
  ggtitle("Proportion of ENCODE H3K27me3 peaks captured by CUT&Tag (with duplicates)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "bottom") +
  scale_fill_discrete(name = "Peak caller", labels = c("SEACR", "MACS2"))

ENCODE_in_CT_H3K27me3_plot_withDup

Heatmaps

Heatmaps over transcription units

All samples scaled to 2 million reads

Heatmap_hg19

Heatmaps over peaks

All samples scaled to 2 million total reads

SEACR

MACS2

Annotation

ChromHMM

chrHMM_url = "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeBroadHmm/wgEncodeBroadHmmK562HMM.bed.gz"
#chrHMM_url = "https://www.encodeproject.org/files/ENCFF163QUM/@@download/ENCFF163QUM.bed.gz"

chrHMM = genomation::readBed(chrHMM_url)
## Rows: 2 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): X1, X4, X6
## dbl (5): X2, X3, X5, X7, X8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
chrHMM_list = GenomicRanges::split(chrHMM, chrHMM$name, drop = TRUE) 

chrHMM_names = c("Transcription elongation [10]", "Weak transcription [11]", "Repressed [12]", "Heterochromatin [13]", "Repetitive [14]", "Repetitive [15]", "Active promoter [1]", "Weak promoter [2]", "Poised promoter [3]", "Strong enhancer [4]", "Strong enhancer [5]",  "Weak enhancer [6]", "Weak enhancer [7]", "Insulator [8]", "Transcription transition [9]")
names(chrHMM_list) = chrHMM_names

All samples

annotation = genomation::annotateWithFeatures(GenomicRanges::GRangesList(peakList_with_ENCODE), chrHMM_list)
matrix = genomation::heatTargetAnnotation(annotation, plot = FALSE, precedence = FALSE)

rownames(matrix) = sampleList_all_with_ENCODE
colnames(matrix) = chrHMM_names
matrix_melt = melt(matrix)
colnames(matrix_melt) = c("Sample", "State", "% of peaks")

ggplot(matrix_melt) +
  theme_minimal() +
  geom_tile(aes(x = State, y = Sample, fill = `% of peaks`)) +
  ylab("") +
  scale_fill_viridis(option = "plasma") +
  ggpubr::rotate_x_text(angle = 45) +
  theme(text = element_text(size = 15)) +
  theme(legend.key.size = unit(1, "cm")) 

Duplicate-only peaks

Obtain duplicate-only peaks:

duplicate_only_peaks = c()

for(i in 1:length(peakList)){
  dup_only = IRanges::subsetByOverlaps(peakList_withDup[[i]], peakList[[i]], invert = TRUE)
  duplicate_only_peaks = c(duplicate_only_peaks, dup_only)
}

names(duplicate_only_peaks) = names(peakList)
annotation_dup_only = genomation::annotateWithFeatures(GenomicRanges::GRangesList(duplicate_only_peaks), chrHMM_list)
matrix = genomation::heatTargetAnnotation(annotation_dup_only, plot = FALSE)

rownames(matrix) = sampleList_all
colnames(matrix) = chrHMM_names
matrix_melt = melt(matrix)
colnames(matrix_melt) = c("Sample", "State", "% of peaks")

ggplot(matrix_melt) +
  theme_minimal() +
  geom_tile(aes(x = State, y = Sample, fill = `% of peaks`)) +
  ylab("") +
  scale_fill_viridis(option = "plasma") +
  ggpubr::rotate_x_text(angle = 45) +
  theme(text = element_text(size = 15)) +
  theme(legend.key.size = unit(1, "cm")) 

C&T peaks in ENCODE H3K27ac

annotation_in_ENCODE_H3K27ac = genomation::annotateWithFeatures(GenomicRanges::GRangesList(CT_peaks_in_ENCODE_H3K27ac_set), chrHMM_list)

matrix = genomation::heatTargetAnnotation(annotation_in_ENCODE_H3K27ac, plot = FALSE)

rownames(matrix) = sampleList_all
colnames(matrix) = chrHMM_names
matrix_melt = melt(matrix)
colnames(matrix_melt) = c("Sample", "State", "% of peaks")

ggplot(matrix_melt) +
  theme_minimal() +
  geom_tile(aes(x = State, y = Sample, fill = `% of peaks`)) +
  ylab("") +
  scale_fill_viridis(option = "plasma") +
  ggpubr::rotate_x_text(angle = 45) +
  theme(text = element_text(size = 15)) +
  theme(legend.key.size = unit(1, "cm")) 

C&T peaks not in ENCODE H3K27ac

annotation_not_in_ENCODE_H3K27ac = genomation::annotateWithFeatures(GenomicRanges::GRangesList(CT_peaks_not_in_ENCODE_H3K27ac_set), chrHMM_list)

matrix = genomation::heatTargetAnnotation(annotation_not_in_ENCODE_H3K27ac, plot = FALSE)

rownames(matrix) = sampleList_all
colnames(matrix) = chrHMM_names
matrix_melt = melt(matrix)
colnames(matrix_melt) = c("Sample", "State", "% of peaks")

ggplot(matrix_melt) +
  theme_minimal() +
  geom_tile(aes(x = State, y = Sample, fill = `% of peaks`)) +
  ylab("") +
  scale_fill_viridis(option = "plasma") +
  ggpubr::rotate_x_text(angle = 45) +
  theme(text = element_text(size = 15)) +
  theme(legend.key.size = unit(1, "cm")) 

ENCODE H3K27ac peaks in C&T

annotation_in_CT_H3K27ac = genomation::annotateWithFeatures(GenomicRanges::GRangesList(ENCODE_H3K27ac_peaks_in_CT_set), chrHMM_list)

matrix = genomation::heatTargetAnnotation(annotation_in_CT_H3K27ac, plot = FALSE)

colnames(matrix) = chrHMM_names
matrix_melt = melt(matrix)
colnames(matrix_melt) = c("Sample", "State", "% of peaks")

ggplot(matrix_melt) +
  theme_minimal() +
  geom_tile(aes(x = State, y = Sample, fill = `% of peaks`)) +
  ylab("") +
  scale_fill_viridis(option = "plasma") +
  ggpubr::rotate_x_text(angle = 45) +
  theme(text = element_text(size = 15)) +
  theme(legend.key.size = unit(1, "cm")) 

ENCODE H3K27ac peaks not in C&T

annotation_not_in_CT_H3K27ac = genomation::annotateWithFeatures(GenomicRanges::GRangesList(ENCODE_H3K27ac_peaks_not_in_CT_set), chrHMM_list)

matrix = genomation::heatTargetAnnotation(annotation_not_in_CT_H3K27ac, plot = FALSE)

colnames(matrix) = chrHMM_names
matrix_melt = melt(matrix)
colnames(matrix_melt) = c("Sample", "State", "% of peaks")

ggplot(matrix_melt) +
  theme_minimal() +
  geom_tile(aes(x = State, y = Sample, fill = `% of peaks`)) +
  ylab("") +
  scale_fill_viridis(option = "plasma") +
  ggpubr::rotate_x_text(angle = 45) +
  theme(text = element_text(size = 15)) +
  theme(legend.key.size = unit(1, "cm")) 

SEACR vs MACS2 peaks

# use original peaks (unfiltered)
SEACR_peakList = peakList_unfilt[seq(1, length(peakList), by = 2)]
MACS2_peakList = peakList_unfilt[seq(2, length(peakList), by = 2)]
SEACR_only_peaks = list()
MACS2_only_peaks = list()

for(i in 1:length(SEACR_peakList)){
  SEACR_only_peaks[[i]] = IRanges::subsetByOverlaps(SEACR_peakList[[i]], MACS2_peakList[[i]], invert = TRUE)
  MACS2_only_peaks[[i]] = IRanges::subsetByOverlaps(MACS2_peakList[[i]], SEACR_peakList[[i]], invert = TRUE)
}

names(SEACR_only_peaks) = sampleList
names(MACS2_only_peaks) = sampleList
SEACR_only_peaks_total = c()
SEACR_only_peaks_in_ENCODE = c()
MACS2_only_peaks_total = c()
MACS2_only_peaks_in_ENCODE = c()

for(i in 1:length(SEACR_only_peaks)){
  SEACR_only_peaks_in_ENCODE[i] = IRanges::subsetByOverlaps(SEACR_only_peaks[[i]], ENCODE_H3K27ac) %>% length()
  SEACR_only_peaks_total[i] = length(SEACR_only_peaks[[i]])
  MACS2_only_peaks_in_ENCODE[i] = IRanges::subsetByOverlaps(MACS2_only_peaks[[i]], ENCODE_H3K27ac) %>% length()
  MACS2_only_peaks_total[i] = length(MACS2_only_peaks[[i]])
}

caller_specific_peaks = data.frame(Sample = names(SEACR_peakList), SEACR_only_peaks_in_ENCODE = SEACR_only_peaks_in_ENCODE, SEACR_only_peaks_total = SEACR_only_peaks_total, MACS2_only_peaks_in_ENCODE = MACS2_only_peaks_in_ENCODE, MACS2_only_peaks_total = MACS2_only_peaks_total)

caller_specific_peaks$SEACR_only_peaks_in_ENCODE_percentage = caller_specific_peaks$SEACR_only_peaks_in_ENCODE / caller_specific_peaks$SEACR_only_peaks_total * 100

caller_specific_peaks$MACS2_only_peaks_in_ENCODE_percentage = caller_specific_peaks$MACS2_only_peaks_in_ENCODE / caller_specific_peaks$MACS2_only_peaks_total * 100

caller_specific_peaks
SEACR
annotation = genomation::annotateWithFeatures(GenomicRanges::GRangesList(SEACR_only_peaks), chrHMM_list)
matrix = genomation::heatTargetAnnotation(annotation, plot = FALSE)

rownames(matrix) = names(SEACR_only_peaks)
colnames(matrix) = chrHMM_names
matrix_melt = melt(matrix)
colnames(matrix_melt) = c("Sample", "State", "% of peaks")

ggplot(matrix_melt) +
  theme_minimal() +
  geom_tile(aes(x = State, y = Sample, fill = `% of peaks`)) +
  ylab("") +
  scale_fill_viridis(option = "plasma") +
  ggpubr::rotate_x_text(angle = 45) +
  theme(text = element_text(size = 25)) +
  theme(legend.key.size = unit(1, "cm")) 

MACS2
annotation = genomation::annotateWithFeatures(GenomicRanges::GRangesList(MACS2_only_peaks), chrHMM_list)
matrix = genomation::heatTargetAnnotation(annotation, plot = FALSE)

rownames(matrix) = names(MACS2_only_peaks)
colnames(matrix) = chrHMM_names
matrix_melt = melt(matrix)
colnames(matrix_melt) = c("Sample", "State", "% of peaks")

ggplot(matrix_melt) +
  theme_minimal() +
  geom_tile(aes(x = State, y = Sample, fill = `% of peaks`)) +
  ylab("") +
  scale_fill_viridis(option = "plasma") +
  ggpubr::rotate_x_text(angle = 45) +
  theme(text = element_text(size = 25)) +
  theme(legend.key.size = unit(1, "cm")) 

ChIPseeker

peakAnnoList = lapply(peakList_with_ENCODE, annotatePeak, TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene, tssRegion = c(-3000, 3000), verbose = FALSE)

annobar = ChIPseeker::plotAnnoBar(peakAnnoList)
annobar

Enrichment analysis

clusterProfiler

KEGG

peakAnnoList = lapply(peakList_with_ENCODE, annotatePeak, TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene, tssRegion = c(-3000, 3000), verbose = FALSE)

genes = lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)
names(genes) = sub("_", "\n", names(genes))
compKEGG = clusterProfiler::compareCluster(geneCluster = genes,
                                           fun = "enrichKEGG",
                                           pvalueCutoff = 0.05,
                                           pAdjustMethod = "BH")

KEGG_plot = clusterProfiler::dotplot(compKEGG, showCategory = 10, title = "KEGG Pathway Enrichment Analysis") +
  theme_bw(base_size = 13) +
  ggpubr::rotate_x_text(angle = 45) +
  scale_size_area(max_size = 10)

KEGG_plot

GO

peakAnnoList = lapply(peakList_with_ENCODE, annotatePeak, TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene, tssRegion = c(-3000, 3000), verbose = FALSE)

genes = lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)
names(genes) = sub("_", "\n", names(genes))
compGO = clusterProfiler::compareCluster(geneCluster = genes,
                                           OrgDb = "org.Hs.eg.db", 
                                           fun = "enrichGO",
                                           pvalueCutoff = 0.05,
                                           pAdjustMethod = "BH")

GO_plot = clusterProfiler::dotplot(compGO, showCategory = 10, title = "GO Enrichment Analysis") +
  theme_bw(base_size = 13) +
  ggpubr::rotate_x_text(angle = 45) +
  scale_size_area(max_size = 10)

GO_plot

ReactomePA

pathways = list()

for(peak in peakList_with_ENCODE){
  gene = ChIPseeker::seq2gene(peak, tssRegion = c(-3000, 3000), flankDistance = 3000, TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene)
  pathway = ReactomePA::enrichPathway(gene)
  pathways = c(pathways, pathway)
}

names(pathways) = names(peakList_with_ENCODE)
pathways = list()
geneList = list()

for(i in 1:length(peakList_with_ENCODE)){
  gene = ChIPseeker::seq2gene(peakList_with_ENCODE[[i]], tssRegion = c(-3000, 3000), flankDistance = 3000, TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene)
  geneList[[i]] = gene
  pathway = ReactomePA::enrichPathway(gene)
  pathways[[i]] = pathway
}

names(pathways) = names(peakList_with_ENCODE)
names(geneList) = names(peakList_with_ENCODE)
ReactomePA_comp = clusterProfiler::compareCluster(geneList, fun = "enrichPathway")

clusterProfiler::dotplot(ReactomePA_comp, showCategory = 10) +
  theme_bw(base_size = 13) +
  ggtitle("Reactome Pathway Analysis") +
  ggpubr::rotate_x_text(angle = 45) +
  scale_size_area(max_size = 10)

Motifs

motifs = list()

for(i in 1:length(sampleList_all_with_ENCODE)){
  motif = marge::read_known_results(path = paste0("/Volumes/test_data/motifs/", sampleList_all_with_ENCODE[i]), homer_dir = TRUE)
  motifs[[i]] = motif[order(-motif$log_p_value), ]
}

motifs = setNames(motifs, sampleList_all_with_ENCODE)

#in some cases top -log(p) values are infinite (especially for ENCODE H3K27ac), so will set them to the maximum -log(p-value) attained by other samples
for(i in 1:length(sampleList_all_with_ENCODE)){
  motifs[[i]]$log_p_value[(motifs[[i]]$log_p_value == "Inf")] = 300
}

By motif family

All motifs

ATAC-seq reads in captured vs missed ENCODE peaks

ATAC-seq libraries: ENCLB918NXF, ENCLB758GEG from https://www.encodeproject.org/experiments/ENCSR483RKN/

# did not evaluate
bamFile = "/Volumes/test_data/resources/ATAC/bam/K562_ATACseq_R1.mLb.clN.sorted.bam"

total_ATAC_reads = 49677798

ATAC_in_captured_ENCODE_inPeakData = c()
for(i in 1:length(sampleList_all)){
  total_bases = sum(width(ENCODE_H3K27ac_peaks_in_CT_set[[i]]))
  peak = data.frame(ENCODE_H3K27ac_peaks_in_CT_set[[i]])
  peak.gr = GenomicRanges::GRanges(seqnames = peak$seqnames, IRanges(start = peak$start, end = peak$end), strand = "*")
  fragment_counts = chromVAR::getCounts(bamFile, peak.gr, paired = FALSE, by_rg = FALSE, format = "bam")
  ATAC_in_captured_ENCODE_inPeakN = counts(fragment_counts)[,1] %>% sum
  ATAC_in_captured_ENCODE_inPeakData = rbind(ATAC_in_captured_ENCODE_inPeakData, data.frame(Sample = sampleList_all[i], ATAC_reads_in_captured_peaks = ATAC_in_captured_ENCODE_inPeakN, Total_bases = total_bases))
}

# adjust FRiPs for total coverage of ENCODE peak set and multiplied by constant 
ATAC_in_captured_ENCODE_inPeakData$measure_ATAC_in_captured_ENCODE = ATAC_in_captured_ENCODE_inPeakData$ATAC_reads_in_captured_peaks/total_ATAC_reads * (10^9/ATAC_in_captured_ENCODE_inPeakData$Total_bases)
ATAC_in_captured_ENCODE_inPeakData
# did not evaluate
ATAC_in_missed_ENCODE_inPeakData = c()
for(i in 1:length(sampleList_all)){
  total_bases = sum(width(ENCODE_H3K27ac_peaks_not_in_CT_set[[i]]))
  peak = data.frame(ENCODE_H3K27ac_peaks_not_in_CT_set[[i]])
  peak.gr = GenomicRanges::GRanges(seqnames = peak$seqnames, IRanges(start = peak$start, end = peak$end), strand = "*")
  fragment_counts = chromVAR::getCounts(bamFile, peak.gr, paired = FALSE, by_rg = FALSE, format = "bam")
  ATAC_in_missed_ENCODE_inPeakN = counts(fragment_counts)[,1] %>% sum
  ATAC_in_missed_ENCODE_inPeakData = rbind(ATAC_in_missed_ENCODE_inPeakData, data.frame(Sample = sampleList_all[i], ATAC_reads_in_missed_peaks = ATAC_in_missed_ENCODE_inPeakN, Total_bases = total_bases))
}

# adjusted FRiPs for total coverage of ENCODE peak set and multiplied by constant 
ATAC_in_missed_ENCODE_inPeakData$measure_ATAC_in_missed_ENCODE = ATAC_in_missed_ENCODE_inPeakData$ATAC_reads_in_missed_peaks/total_ATAC_reads * (10^9/ATAC_in_missed_ENCODE_inPeakData$Total_bases)
ATAC_in_missed_ENCODE_inPeakData
# did not evaluate
ATAC_reads_in_ENCODE_summary = data.frame(Sample = labels_SEACR_MACS2, ATAC_in_captured_ENCODE = ATAC_in_captured_ENCODE_inPeakData$measure_ATAC_in_captured_ENCODE, ATAC_in_missed_ENCODE = ATAC_in_missed_ENCODE_inPeakData$measure_ATAC_in_missed_ENCODE)

ATAC_reads_in_ENCODE_summary_melt = melt(ATAC_reads_in_ENCODE_summary, id.vars = "Sample")

ATAC_reads_in_ENCODE_plot = ggplot(ATAC_reads_in_ENCODE_summary_melt, aes(Sample, value)) +
  geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
  theme_bw() +
  ylab("ATAC reads in ENCODE (adj %)") +
  xlab("") +
  ggtitle("ATAC reads in ENCODE H3K27ac peaks captured vs missed by CUT&Tag") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "bottom") +
  scale_fill_discrete(name = "", labels = c("Reads in captured ENCODE peaks", "Reads in missed ENCODE peaks"))

ATAC_reads_in_ENCODE_plot

STARR-seq

#STARRseq_rep1 = ChIPseeker::readPeakFile(paste0(dir, "/STARRseq_peaks_hg38_ENCFF717VJK.bed"), as = "GRanges") 
#STARRseq_rep2 = ChIPseeker::readPeakFile(paste0(dir, "/STARRseq_peaks_hg38_ENCFF394DBM.bed"), as = "GRanges") 
#STARRseq_hg38 = IRanges::subsetByOverlaps(STARRseq_rep1, STARRseq_rep2)

chainfile = rtracklayer::import.chain(system.file(package = "liftOver", "extdata", "hg38ToHg19.over.chain"))
#seqlevelsStyle(STARRseq_hg38) = "UCSC" 
#STARRseq = rtracklayer::liftOver(STARRseq_hg38, chainfile) %>% unlist() 
#write.table(data.frame(STARRseq), file = paste0(datadir, "/resources/STARRseq/STARRseq_hg19_replicated_peaks_ENCSR858MPS.bed"), sep = "\t", col.names = FALSE, row.names = FALSE)
STARRseq = ChIPseeker::readPeakFile(paste0(datadir, "/resources/STARRseq/STARRseq_hg19_replicated_peaks_ENCSR858MPS.bed"), as = "GRanges")


dir = "/Users/leylaabbasova/Documents/CUTnTag/analysis/STARRseq"
ENCODE_DNase = ChIPseeker::readPeakFile(paste0(dir, "/DNase_seq_hg38_ENCFF722NSO.bed"), as = "GRanges")
seqlevelsStyle(ENCODE_DNase) = "UCSC" 
ENCODE_DNase = rtracklayer::liftOver(ENCODE_DNase, chainfile) %>% unlist()

Filter peaks

STARRseq_filt = IRanges::subsetByOverlaps(STARRseq, ENCODE_DNase)

Test overlaps

STARRseq_overlapping_peaks = c()
STARRseq_overlap_prop = c()
STARRseq_missed_peaks = c()
STARRseq_filt_overlapping_peaks = c()
STARRseq_filt_overlap_prop = c()
STARRseq_filt_missed_peaks = c()

for(i in 1:length(peakList_with_ENCODE)){
  STARRseq_overlapping_peaks[[i]] = IRanges::subsetByOverlaps(STARRseq, peakList_with_ENCODE[[i]])
  STARRseq_missed_peaks[[i]] = IRanges::subsetByOverlaps(STARRseq, peakList_with_ENCODE[[i]], invert = TRUE)
  STARRseq_overlap_prop = c(STARRseq_overlap_prop, length(STARRseq_overlapping_peaks[[i]])/length(STARRseq))
  
  STARRseq_filt_overlapping_peaks[[i]] = IRanges::subsetByOverlaps(STARRseq_filt, peakList_with_ENCODE[[i]])
  STARRseq_filt_missed_peaks[[i]] = IRanges::subsetByOverlaps(STARRseq_filt, peakList_with_ENCODE[[i]], invert = TRUE)
  STARRseq_filt_overlap_prop = c(STARRseq_filt_overlap_prop, length(STARRseq_filt_overlapping_peaks[[i]])/length(STARRseq_filt))
}


names(STARRseq_overlapping_peaks) = sampleList_all_with_ENCODE
names(STARRseq_overlap_prop) = sampleList_all_with_ENCODE
names(STARRseq_missed_peaks) = sampleList_all_with_ENCODE

names(STARRseq_filt_overlapping_peaks) = sampleList_all_with_ENCODE
names(STARRseq_filt_overlap_prop) = sampleList_all_with_ENCODE
names(STARRseq_filt_missed_peaks) = sampleList_all_with_ENCODE

STARRseq_peak_overlaps_df = data.frame("Sample" = sampleList_all_with_ENCODE,
                                       "% captured STARR-seq peaks (all)" = (STARRseq_overlap_prop * 100),
                                       "% captured STARR-seq peaks (filt)" = (STARRseq_filt_overlap_prop * 100),
                                       "Total sample peaks" = lapply(peakList_with_ENCODE, length) %>% unlist %>% unname,
                                       check.names = FALSE, row.names = NULL)
STARRseq_peak_overlaps_df

Compare STARR-seq peaks

All

Check -log10(qval) values of captured and missed STARR-seq peaks:

STARRseq_qvals_df = c()

for(i in 1:length(peakList_with_ENCODE)){
  df_captured = data.frame("Sample" = sampleList_all_with_ENCODE[i], "-log10(qval)" = STARRseq_overlapping_peaks[[i]]$V12, "Captured" = "Captured", check.names = FALSE)
  df_missed = data.frame("Sample" = sampleList_all_with_ENCODE[i], "-log10(qval)" = STARRseq_missed_peaks[[i]]$V12, "Captured" = "Missed", check.names = FALSE)
  STARRseq_qvals_df = rbind(STARRseq_qvals_df, rbind(df_captured, df_missed))
}
STARRseq_qvals_df_melt = melt(STARRseq_qvals_df, id.vars = c("Sample", "Captured"))
ggplot(STARRseq_qvals_df_melt, aes(x = value, y = Sample, fill = Captured)) + 
  theme_bw() +
  stat_boxplot(geom = "errorbar", coef = 1.5) + 
  geom_boxplot(outlier.shape = NA) +
  coord_cartesian(xlim = c(1, 10)) +
  scale_x_continuous(breaks = seq(1, 10, by = 1)) +
  ylab("") +
  xlab("-log10(qval)") +
  labs(fill = "STARR-seq peaks") +
  theme(legend.position = "bottom") +
  theme(text = element_text(size = 15))

tests = c()

for(i in 1:length(sampleList_all_with_ENCODE)){
  captured = STARRseq_qvals_df[(STARRseq_qvals_df$Sample == sampleList_all_with_ENCODE[i] & STARRseq_qvals_df$Captured == "Captured"),]
  missed = STARRseq_qvals_df[(STARRseq_qvals_df$Sample == sampleList_all_with_ENCODE[i] & STARRseq_qvals_df$Captured == "Missed"),]
  var_test = var.test(captured$`-log10(qval)`, missed$`-log10(qval)`)
  var = ifelse(var_test$p.value < 0.05, FALSE, TRUE)
  test = t.test(captured$`-log10(qval)`, missed$`-log10(qval)`, var.equal = var, paired = FALSE, alternative = c("two.sided"))
  tests[[i]] = test
}

names(tests) = sampleList_all_with_ENCODE
tests
## $Diagenode_C15410196_1_50_SEACR
## 
##  Welch Two Sample t-test
## 
## data:  captured$`-log10(qval)` and missed$`-log10(qval)`
## t = 18.18, df = 2011, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.8831205 1.0966932
## sample estimates:
## mean of x mean of y 
##  3.239952  2.250045 
## 
## 
## $Diagenode_C15410196_1_50_MACS2
## 
##  Welch Two Sample t-test
## 
## data:  captured$`-log10(qval)` and missed$`-log10(qval)`
## t = 19.091, df = 2155.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.8915945 1.0957315
## sample estimates:
## mean of x mean of y 
##  3.237487  2.243824 
## 
## 
## $CST9733_1_100_H3K27me3_SEACR
## 
##  Welch Two Sample t-test
## 
## data:  captured$`-log10(qval)` and missed$`-log10(qval)`
## t = -9.0224, df = 1729.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.3584189 -0.2304155
## sample estimates:
## mean of x mean of y 
##  2.065323  2.359740 
## 
## 
## $CST9733_1_100_H3K27me3_MACS2
## 
##  Welch Two Sample t-test
## 
## data:  captured$`-log10(qval)` and missed$`-log10(qval)`
## t = -9.0034, df = 1838.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.3551120 -0.2280743
## sample estimates:
## mean of x mean of y 
##  2.069078  2.360671 
## 
## 
## $ENCODE_H3K27ac
## 
##  Welch Two Sample t-test
## 
## data:  captured$`-log10(qval)` and missed$`-log10(qval)`
## t = 24.894, df = 5019.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.775484 0.908063
## sample estimates:
## mean of x mean of y 
##  3.009353  2.167580 
## 
## 
## $ENCODE_H3K27me3
## 
##  Welch Two Sample t-test
## 
## data:  captured$`-log10(qval)` and missed$`-log10(qval)`
## t = -5.2025, df = 9194.3, p-value = 2.008e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.16923144 -0.07660483
## sample estimates:
## mean of x mean of y 
##  2.248104  2.371022

Filtered

STARRseq_filt_qvals_df = c()

STARRseq_filt_overlapping_peaks_cut = STARRseq_filt_overlapping_peaks[names(STARRseq_filt_overlapping_peaks) != "ENCODE DNase"]
STARRseq_filt_missed_peaks_cut = STARRseq_filt_missed_peaks[names(STARRseq_filt_missed_peaks) != "ENCODE DNase"]

for(i in 1:length(STARRseq_filt_overlapping_peaks_cut)){
  df_captured = data.frame("Sample" = names(STARRseq_filt_overlapping_peaks_cut)[i], "-log10(qval)" = STARRseq_filt_overlapping_peaks_cut[[i]]$V12, "Captured" = "Captured", check.names = FALSE)
  df_missed = data.frame("Sample" = names(STARRseq_filt_missed_peaks_cut)[i], "-log10(qval)" = STARRseq_filt_missed_peaks_cut[[i]]$V12, "Captured" = "Missed", check.names = FALSE)
  STARRseq_filt_qvals_df = rbind(STARRseq_filt_qvals_df, rbind(df_captured, df_missed))
}
STARRseq_filt_qvals_df_melt = melt(STARRseq_filt_qvals_df, id.vars = c("Sample", "Captured"))
ggplot(STARRseq_filt_qvals_df_melt, aes(x = value, y = Sample, fill = Captured)) + 
  theme_bw() +
  stat_boxplot(geom = "errorbar", coef = 1.5) + 
  geom_boxplot(outlier.shape = NA) +
  coord_cartesian(xlim = c(1, 10)) +
  scale_x_continuous(breaks = seq(1, 10, by = 1)) +
  scale_y_discrete(limits = rev(levels(STARRseq_filt_qvals_df_melt$Sample))) +
  ylab("") +
  xlab("-log10(qval)") +
  labs(fill = "STARR-seq peaks") +
  theme(legend.position = "bottom") +
  theme(text = element_text(size = 18))

tests = c()

for(i in 1:length(STARRseq_filt_overlapping_peaks_cut)){
  captured = STARRseq_filt_qvals_df[(STARRseq_filt_qvals_df$Sample == names(STARRseq_filt_overlapping_peaks_cut)[i] & STARRseq_filt_qvals_df$Captured == "Captured"),]
  missed = STARRseq_filt_qvals_df[(STARRseq_filt_qvals_df$Sample == names(STARRseq_filt_overlapping_peaks_cut)[i] & STARRseq_filt_qvals_df$Captured == "Missed"),]
  var_test = var.test(captured$`-log10(qval)`, missed$`-log10(qval)`)
  var = ifelse(var_test$p.value < 0.05, FALSE, TRUE)
  test = t.test(captured$`-log10(qval)`, missed$`-log10(qval)`, var.equal = var)
  tests[[i]] = test
}

names(tests) = names(STARRseq_filt_overlapping_peaks_cut)
tests
## $Diagenode_C15410196_1_50_SEACR
## 
##  Welch Two Sample t-test
## 
## data:  captured$`-log10(qval)` and missed$`-log10(qval)`
## t = 11.289, df = 2385.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.5626159 0.7991626
## sample estimates:
## mean of x mean of y 
##  3.303851  2.622962 
## 
## 
## $Diagenode_C15410196_1_50_MACS2
## 
##  Welch Two Sample t-test
## 
## data:  captured$`-log10(qval)` and missed$`-log10(qval)`
## t = 11.857, df = 2595.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.5765521 0.8050417
## sample estimates:
## mean of x mean of y 
##  3.301657  2.610861 
## 
## 
## $CST9733_1_100_H3K27me3_SEACR
## 
##  Welch Two Sample t-test
## 
## data:  captured$`-log10(qval)` and missed$`-log10(qval)`
## t = -4.6656, df = 441.88, p-value = 4.084e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.5336144 -0.2172999
## sample estimates:
## mean of x mean of y 
##  2.427459  2.802916 
## 
## 
## $CST9733_1_100_H3K27me3_MACS2
## 
##  Welch Two Sample t-test
## 
## data:  captured$`-log10(qval)` and missed$`-log10(qval)`
## t = -4.1878, df = 501.5, p-value = 3.328e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.4947802 -0.1787796
## sample estimates:
## mean of x mean of y 
##  2.466447  2.803227 
## 
## 
## $ENCODE_H3K27ac
## 
##  Welch Two Sample t-test
## 
## data:  captured$`-log10(qval)` and missed$`-log10(qval)`
## t = 13.588, df = 7322.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.4956927 0.6628267
## sample estimates:
## mean of x mean of y 
##  3.051485  2.472225 
## 
## 
## $ENCODE_H3K27me3
## 
##  Welch Two Sample t-test
## 
## data:  captured$`-log10(qval)` and missed$`-log10(qval)`
## t = -3.7013, df = 3652.9, p-value = 0.0002177
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.27563262 -0.08473987
## sample estimates:
## mean of x mean of y 
##  2.651407  2.831593

Subsample

Peaks

Randomly sample 6000 peaks from each sample and compare overlaps:

peakList_sub = c()

for(i in 1:length(peakList_with_ENCODE)){
  peakList_sub[[i]] = peakList_with_ENCODE[[i]][sample(length(peakList_with_ENCODE[[i]]), 6000), ]
}

names(peakList_sub) = names(peakList_with_ENCODE)
sub_STARRseq_peak_overlaps_df = EpiCompare::overlap_percent(STARRseq, peakList_sub)
sub_STARRseq_peak_overlaps_df["Percentage (filt)"] = (EpiCompare::overlap_percent(STARRseq_filt, peakList_sub))$Percentage %>% round(3)
sub_STARRseq_peak_overlaps_df["Mb"] = lapply(peakList_sub, function(x){sum(GenomicRanges::width(x))/10^6}) %>% unlist() %>% round(2)
sub_STARRseq_peak_overlaps_df

Coverage

Keep genome coverage the same:

test_coverage = function(peaks){sum(GenomicRanges::width(peaks)/10^6) %>% unlist() %>% round(0)}
peakList_sub_cov = c()
range_start = 500
# minimum coverage is 8Mb if also using ENCODE DNase peaks in comparison, otherwise can set higher limit
Mb = 15


for(i in 1:length(peakList_with_ENCODE)){
  #print(paste("Peak", i))
  
  # if included, use all DNase peaks
  if(names(peakList_with_ENCODE)[i] == "ENCODE DNase"){
    range = length(peakList_with_ENCODE$`ENCODE DNase`)
  } else {
    range_start = ((Mb - 2)/test_coverage(peakList_with_ENCODE[[i]]) * length(peakList_with_ENCODE[[i]])) %>% round(0)
    range = range_start:length(peakList_with_ENCODE[[i]])
  }
  
  # test lower bound of range
  test_peaks = peakList_with_ENCODE[[i]][sample(length(peakList_with_ENCODE[[i]]), range[1]), ]
  test_cov = test_peaks %>% test_coverage()

  if(test_cov == Mb){
    peakList_sub_cov[[i]] = test_peaks
  }
  
  if(test_cov < Mb){
    for(x in range){
      sub_peaks = peakList_with_ENCODE[[i]][sample(length(peakList_with_ENCODE[[i]]), x), ]
      coverage = test_coverage(sub_peaks)
      if(coverage == Mb){
        peakList_sub_cov[[i]] = sub_peaks
        break
      } 
    }
  }
  
  if(test_cov > Mb){ 
    print(paste("Adjusting range for peak", i))  
    range_start = range_start - 500
    range = range_start:length(peakList_with_ENCODE[[i]])
    }
  }

names(peakList_sub_cov) = names(peakList_with_ENCODE)
sub_cov_STARRseq_peak_overlaps_df = EpiCompare::overlap_percent(STARRseq, peakList_sub_cov)
sub_cov_STARRseq_peak_overlaps_df["Percentage (filt)"] = (EpiCompare::overlap_percent(STARRseq_filt, peakList_sub_cov))$Percentage %>% round(3)
sub_cov_STARRseq_peak_overlaps_df["Mb"] = lapply(peakList_sub_cov, function(x){sum(GenomicRanges::width(reduce(x)))/10^6}) %>% unlist() %>% round(2)
sub_cov_STARRseq_peak_overlaps_df

Summarise

STARRseq_peak_overlaps_df_cut = STARRseq_peak_overlaps_df[STARRseq_peak_overlaps_df$Sample %in% rownames(sub_STARRseq_peak_overlaps_df), ]
STARRseq_peak_overlaps_df
STARRseq_peak_overlaps_df_cut
sub_STARRseq_peak_overlaps_df
sub_cov_STARRseq_peak_overlaps_df
sub_STARRseq_peak_overlaps_df = sub_STARRseq_peak_overlaps_df[match(STARRseq_peak_overlaps_df_cut$Sample, rownames(sub_STARRseq_peak_overlaps_df)),]
sub_cov_STARRseq_peak_overlaps_df = sub_cov_STARRseq_peak_overlaps_df[match(STARRseq_peak_overlaps_df_cut$Sample, rownames(sub_cov_STARRseq_peak_overlaps_df)),]


STARRseq_capture_summary = data.frame("Sample" = STARRseq_peak_overlaps_df_cut$Sample, "Uncorrected" = STARRseq_peak_overlaps_df_cut$`% captured STARR-seq peaks (all)`, "Corrected for genomic coverage" = sub_cov_STARRseq_peak_overlaps_df$Percentage, check.names = FALSE)

STARRseq_capture_summary_filt = data.frame("Sample" = STARRseq_peak_overlaps_df_cut$Sample, "Uncorrected" = STARRseq_peak_overlaps_df_cut$`% captured STARR-seq peaks (filt)`, "Corrected for genomic coverage" = sub_cov_STARRseq_peak_overlaps_df$`Percentage (filt)`, check.names = FALSE)


STARRseq_capture_summary
STARRseq_capture_summary_filt
STARRseq_capture_summary_melt = melt(STARRseq_capture_summary)
## Using Sample as id variables
STARRseq_capture_summary_melt$Sample = factor(STARRseq_capture_summary_melt$Sample, levels = unique(STARRseq_capture_summary_melt$Sample))

ggplot(STARRseq_capture_summary_melt, aes(y = Sample, x = value, fill = variable)) +
  theme_bw() +
  geom_bar(stat = "identity", width = 0.75, position = position_dodge(width = 0.75), color = "black", size = 0.2) +
  scale_y_discrete(limits = rev(levels(STARRseq_capture_summary_melt$Sample))) +
  ylab("") +
  #xlim(0,40) +
  xlab("% of STARR-seq peaks captured") +
  theme(legend.position = "bottom", legend.title = element_blank()) +
  guides(fill = guide_legend(ncol = 2, reverse = FALSE)) +
  theme(text = element_text(size = 12))  

STARRseq_capture_summary_filt_melt = melt(STARRseq_capture_summary_filt)
## Using Sample as id variables
STARRseq_capture_summary_filt_melt$Sample = factor(STARRseq_capture_summary_filt_melt$Sample, levels = unique(STARRseq_capture_summary_filt_melt$Sample))

ggplot(STARRseq_capture_summary_filt_melt, aes(y = Sample, x = value, fill = variable)) +
  theme_bw() +
  geom_bar(stat = "identity", width = 0.75, position = position_dodge(width = 0.75), color = "black", size = 0.2) +
  scale_y_discrete(limits = rev(levels(STARRseq_capture_summary_filt_melt$Sample))) +
  ylab("") +
  #xlim(0,40) +
  xlab("% of STARR-seq peaks captured") +
  theme(legend.position = "bottom", legend.title = element_blank()) +
  guides(fill = guide_legend(ncol = 2, reverse = FALSE)) +
  theme(text = element_text(size = 12)) 

Session info

sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] DiffBind_3.6.2                         
##  [2] SummarizedExperiment_1.26.1            
##  [3] MatrixGenerics_1.8.0                   
##  [4] matrixStats_0.62.0                     
##  [5] marge_0.0.4.9999                       
##  [6] genomation_1.28.0                      
##  [7] org.Hs.eg.db_3.15.0                    
##  [8] BSgenome.Hsapiens.UCSC.hg19_1.4.3      
##  [9] BSgenome_1.64.0                        
## [10] Biostrings_2.64.0                      
## [11] XVector_0.36.0                         
## [12] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [13] GenomicFeatures_1.48.3                 
## [14] AnnotationDbi_1.58.0                   
## [15] Biobase_2.56.0                         
## [16] BRGenomics_1.8.0                       
## [17] clusterProfiler_4.4.2                  
## [18] rtracklayer_1.56.0                     
## [19] ReactomePA_1.40.0                      
## [20] ChIPseeker_1.32.0                      
## [21] chromVAR_1.18.0                        
## [22] GenomicRanges_1.48.0                   
## [23] GenomeInfoDb_1.32.2                    
## [24] IRanges_2.30.0                         
## [25] S4Vectors_0.34.0                       
## [26] BiocGenerics_0.42.0                    
## [27] corrplot_0.92                          
## [28] viridis_0.6.2                          
## [29] viridisLite_0.4.0                      
## [30] gplots_3.1.3                           
## [31] ggbreak_0.1.0                          
## [32] ggpubr_0.4.0                           
## [33] ggplot2_3.3.6                          
## [34] reshape2_1.4.4                         
## [35] stringr_1.4.0                          
## [36] dplyr_1.0.9                            
## 
## loaded via a namespace (and not attached):
##   [1] rappdirs_0.3.3                R.methodsS3_1.8.1            
##   [3] coda_0.19-4                   tidyr_1.2.0                  
##   [5] bit64_4.0.5                   knitr_1.39                   
##   [7] irlba_2.3.5                   DelayedArray_0.22.0          
##   [9] R.utils_2.11.0                hwriter_1.3.2.1              
##  [11] data.table_1.14.2             KEGGREST_1.36.0              
##  [13] TFBSTools_1.34.0              RCurl_1.98-1.7               
##  [15] generics_0.1.2                preprocessCore_1.58.0        
##  [17] cowplot_1.1.1                 RSQLite_2.2.14               
##  [19] shadowtext_0.1.2              bit_4.0.4                    
##  [21] tzdb_0.3.0                    enrichplot_1.16.1            
##  [23] xml2_1.3.3                    httpuv_1.6.5                 
##  [25] assertthat_0.2.1              DirichletMultinomial_1.38.0  
##  [27] apeglm_1.18.0                 amap_0.8-18                  
##  [29] xfun_0.31                     hms_1.1.1                    
##  [31] jquerylib_0.1.4               evaluate_0.15                
##  [33] promises_1.2.0.1              fansi_1.0.3                  
##  [35] restfulr_0.0.14               progress_1.2.2               
##  [37] caTools_1.18.2                dbplyr_2.2.0                 
##  [39] igraph_1.3.1                  DBI_1.1.2                    
##  [41] geneplotter_1.74.0            htmlwidgets_1.5.4            
##  [43] purrr_0.3.4                   ellipsis_0.3.2               
##  [45] backports_1.4.1               annotate_1.74.0              
##  [47] gridBase_0.4-7                biomaRt_2.52.0               
##  [49] vctrs_0.4.1                   abind_1.4-5                  
##  [51] cachem_1.0.6                  withr_2.5.0                  
##  [53] ggforce_0.3.3                 vroom_1.5.7                  
##  [55] bdsmatrix_1.3-6               GenomicAlignments_1.32.0     
##  [57] treeio_1.20.0                 prettyunits_1.1.1            
##  [59] DOSE_3.22.0                   ape_5.6-2                    
##  [61] lazyeval_0.2.2                seqLogo_1.62.0               
##  [63] crayon_1.5.1                  genefilter_1.78.0            
##  [65] labeling_0.4.2                pkgconfig_2.0.3              
##  [67] tweenr_1.0.2                  nlme_3.1-157                 
##  [69] rlang_1.0.2                   lifecycle_1.0.1              
##  [71] miniUI_0.1.1.1                downloader_0.4               
##  [73] filelock_1.0.2                BiocFileCache_2.4.0          
##  [75] seqPattern_1.28.0             AnnotationHub_3.4.0          
##  [77] invgamma_1.1                  polyclip_1.10-0              
##  [79] graph_1.74.0                  Matrix_1.4-1                 
##  [81] aplot_0.1.6                   ashr_2.2-54                  
##  [83] carData_3.0-5                 boot_1.3-28                  
##  [85] png_0.1-7                     rjson_0.2.21                 
##  [87] bitops_1.0-7                  R.oo_1.24.0                  
##  [89] KernSmooth_2.23-20            blob_1.2.3                   
##  [91] mixsqp_0.3-43                 SQUAREM_2021.1               
##  [93] qvalue_2.28.0                 ShortRead_1.54.0             
##  [95] jpeg_0.1-9                    readr_2.1.2                  
##  [97] rstatix_0.7.0                 gridGraphics_0.5-1           
##  [99] ggsignif_0.6.3                CNEr_1.32.0                  
## [101] reactome.db_1.81.0            scales_1.2.0                 
## [103] memoise_2.0.1                 graphite_1.42.0              
## [105] magrittr_2.0.3                plyr_1.8.7                   
## [107] zlibbioc_1.42.0               compiler_4.2.0               
## [109] scatterpie_0.1.7              bbmle_1.0.25                 
## [111] BiocIO_1.6.0                  RColorBrewer_1.1-3           
## [113] plotrix_3.8-2                 DESeq2_1.36.0                
## [115] Rsamtools_2.12.0              cli_3.3.0                    
## [117] systemPipeR_2.2.2             patchwork_1.1.1              
## [119] MASS_7.3-57                   tidyselect_1.1.2             
## [121] stringi_1.7.6                 highr_0.9                    
## [123] emdbook_1.3.12                yaml_2.3.5                   
## [125] GOSemSim_2.22.0               locfit_1.5-9.5               
## [127] latticeExtra_0.6-29           ggrepel_0.9.1                
## [129] sass_0.4.1                    fastmatch_1.1-3              
## [131] tools_4.2.0                   parallel_4.2.0               
## [133] rstudioapi_0.13               TFMPvalue_0.0.8              
## [135] gridExtra_2.3                 plyranges_1.16.0             
## [137] farver_2.1.0                  ggraph_2.0.5                 
## [139] BiocManager_1.30.18           digest_0.6.29                
## [141] shiny_1.7.1                   pracma_2.3.8                 
## [143] Rcpp_1.0.8.3                  car_3.0-13                   
## [145] broom_0.8.0                   BiocVersion_3.15.2           
## [147] later_1.3.0                   httr_1.4.3                   
## [149] colorspace_2.0-3              XML_3.99-0.9                 
## [151] truncnorm_1.0-8               splines_4.2.0                
## [153] yulab.utils_0.0.4             tidytree_0.3.9               
## [155] EpiCompare_1.0.0              graphlayouts_0.8.0           
## [157] ggplotify_0.1.0               plotly_4.10.0                
## [159] xtable_1.8-4                  jsonlite_1.8.0               
## [161] ggtree_3.4.0                  poweRlaw_0.70.6              
## [163] tidygraph_1.2.1               UpSetR_1.4.0                 
## [165] ggfun_0.0.6                   R6_2.5.1                     
## [167] pillar_1.7.0                  htmltools_0.5.2              
## [169] mime_0.12                     glue_1.6.2                   
## [171] fastmap_1.1.0                 DT_0.23                      
## [173] BiocParallel_1.30.3           interactiveDisplayBase_1.34.0
## [175] codetools_0.2-18              fgsea_1.22.0                 
## [177] GreyListChIP_1.28.1           mvtnorm_1.1-3                
## [179] utf8_1.2.2                    lattice_0.20-45              
## [181] bslib_0.3.1                   tibble_3.1.7                 
## [183] numDeriv_2016.8-1.1           curl_4.3.2                   
## [185] gtools_3.9.2.1                GO.db_3.15.0                 
## [187] survival_3.3-1                limma_3.52.1                 
## [189] rmarkdown_2.14                munsell_0.5.0                
## [191] DO.db_2.9                     GenomeInfoDbData_1.2.8       
## [193] impute_1.70.0                 gtable_0.3.0